Render this script with > sbatch ~/spinal_cord_paper/scripts/Gg_all_int_render.sh
library(Seurat)
library(magrittr)
library(dplyr)
library(Rtsne)
library(RColorBrewer)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(patchwork)
library(cowplot)
library(ggdendro)
the following workflow is based on:
[https://github.com/satijalab/seurat/issues/1500] Cell cycle regression
[https://github.com/satijalab/seurat/issues/1836] General workflow
[https://github.com/satijalab/seurat/issues/2590] Integrate all features (although used in the integration script)
my.samples <- list()
my.samples[[1]] <- readRDS("~/spinal_cord_paper/data/Gg_ctrl_1_seurat_070323.rds")
my.samples[[2]] <- readRDS("~/spinal_cord_paper/data/Gg_ctrl_2_seurat_070323.rds")
my.samples[[3]] <- readRDS("~/spinal_cord_paper/data/Gg_lumb_1_seurat_070323.rds")
my.samples[[4]] <- readRDS("~/spinal_cord_paper/data/Gg_lumb_2_seurat_070323.rds")
my.samples[[5]] <- readRDS("~/spinal_cord_paper/data/Gg_poly_1_seurat_070323.rds")
my.samples[[6]] <- readRDS("~/spinal_cord_paper/data/Gg_poly_2_seurat_070323.rds")
my.samples[[7]] <- readRDS("~/spinal_cord_paper/data/Gg_D05_ctrl_seurat_070323.rds")
my.samples[[8]] <- readRDS("~/spinal_cord_paper/data/Gg_D07_ctrl_seurat_070323.rds")
names(my.samples) <- c("Gg_ctrl_1", "Gg_ctrl_1", "Gg_lumb_1", "Gg_lumb_2", "Gg_poly_1", "Gg_poly_2", "Gg_ctrl_D05", "Gg_ctrl_D07")
# integration features
features <- SelectIntegrationFeatures(
object.list = my.samples,
nfeatures = 3000
)
# prepare integration
my.samples <- PrepSCTIntegration(
object.list = my.samples,
anchor.features = features
)
# find anchors
NT.anchors <- FindIntegrationAnchors(
object.list = my.samples,
normalization.method = "SCT",
anchor.features = features
)
rm(my.samples)
# integrate
my.int <- IntegrateData(
anchorset = NT.anchors,
normalization.method = "SCT"
)
# compute PCA
my.int <- RunPCA(
my.int,
verbose = FALSE
)
# Run tSNE
my.int <- RunTSNE(
my.int,
reduction = "pca",
dims = 1:30
)
saveRDS(my.int, file = paste0("~/spinal_cord_paper/data/Gg_all_seurat_",format(Sys.Date(), "%d%m%y"),".rds"))
my.int <- readRDS("~/spinal_cord_paper/data/Gg_all_seurat_220524.rds")
my.int@project.name <- "Gg_all_int"
PCAPlot(my.int, group.by = "orig.ident")
DimPlot(my.int, reduction = "tsne", group.by = "orig.ident")
DimPlot(my.int, reduction = "tsne", split.by = "orig.ident", ncol = 4)
Use Counts (UMI) and Features (Genes) from the raw data (RNA assay).
meta_int <- my.int@meta.data %>%
mutate(orig.ident = factor(
orig.ident,
levels = c("Gg_D05_ctrl", "Gg_D07_ctrl",
"Gg_ctrl_1", "Gg_ctrl_2",
"Gg_lumb_1", "Gg_lumb_2",
"Gg_poly_1", "Gg_poly_2")))
vln_pm <- ggplot(meta_int, aes(x = orig.ident, y = percent.mt, fill = orig.ident)) +
geom_violin() +
geom_boxplot(width = 0.1) +
theme_cowplot() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45,
vjust = 1,
hjust = 1)
)
vln_nC <- ggplot(meta_int, aes(x = orig.ident, y = nCount_RNA, fill = orig.ident)) +
geom_violin() +
geom_boxplot(width = 0.1) +
theme_cowplot() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45,
vjust = 1,
hjust = 1)
)
vln_cF <- ggplot(meta_int, aes(x = orig.ident, y = nFeature_RNA, fill = orig.ident)) +
geom_violin() +
geom_boxplot(width = 0.1) +
theme_cowplot() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45,
vjust = 1,
hjust = 1)
)
vln_pm + vln_nC + vln_cF
pdf("~/spinal_cord_paper/figures/Supp_Fig_1_Gg_all_vln.pdf", width = 15, height = 8)
vln_pm + vln_nC + vln_cF
# Load the orhtology table
ortho_gg_mm_v102 <- readRDS("~/spinal_cord_paper/data/ortho_gg_mm_v102.rds")
colnames(ortho_gg_mm_v102) <- c(
"GG_gene_ID",
"GG_gene_Name",
"MM_gene_ID",
"MM_gene_Name",
"ortho_conf",
"homolog_type"
)
# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat. We can
# segregate this list into markers of G2/M phase and markers of S phase
s.genes <- ortho_gg_mm_v102 %>%
dplyr::mutate(MM_gene_Name = toupper(MM_gene_Name)) %>%
dplyr::filter(MM_gene_Name %in% cc.genes$s.genes) %>%
dplyr::arrange(match(MM_gene_Name, cc.genes$s.genes)) %>%
dplyr::pull(GG_gene_ID)
g2m.genes <- ortho_gg_mm_v102 %>%
dplyr::mutate(MM_gene_Name = toupper(MM_gene_Name)) %>%
dplyr::filter(MM_gene_Name %in% cc.genes$g2m.genes) %>%
dplyr::arrange(match(MM_gene_Name, cc.genes$g2m.genes)) %>%
dplyr::pull(GG_gene_ID)
t0 <- Sys.time()
my.int <- CellCycleScoring(
my.int,
s.features = s.genes,
g2m.features = g2m.genes,
set.ident = TRUE
)
paste0("Seurats CC scoring took ", Sys.time() - t0, " seconds to run.")
my.int$CC.Difference.seurat <- my.int$S.Score - my.int$G2M.Score
# view cell cycle scores and phase assignments
head(my.int[[]])
rm(t0, s.genes, g2m.genes)
PCAPlot(my.int, group.by = "Phase")
We rescale the data to regress out the CC.difference
all_genes <- rownames(my.int)
my.int <- ScaleData(
my.int,
features = all_genes,
vars.to.regress = "CC.Difference.seurat"
)
## Regressing out CC.Difference.seurat
## Centering and scaling data matrix
# Run the actual PCA (by default uses var.features of assay)
my.int <- RunPCA(my.int, verbose = F)
PCAPlot(my.int, group.by = "Phase")
# Which dimensions will we choose?
hist(my.int@reductions$pca@stdev^2, breaks = 500)
my.dimensions=1:20
We will run the tSNE. Using the FFT-accelerated Interpolation-based t-SNE (FIt-SNE). We run two, a normal tSNE, and an exaggerated tSNE, where clusters are tighter togheter. This is located under the xtsne name.
# Get the tSNE function
source("~/Software/FIt-SNE-master/fast_tsne.R")
my.tsne = fftRtsne(my.int@reductions$pca@cell.embeddings[,my.dimensions],
max_iter = 1500,
learning_rate = round(dim(my.int)[2]/12),
initialization =(my.int@reductions$pca@cell.embeddings[,1:2]/my.int@reductions$pca@stdev[1])*0.0001,
perplexity_list = c(25, round(dim(my.int)[2]/100)),
fast_tsne_path="~/Software/FIt-SNE-master/bin/fast_tsne")
colnames(my.tsne) = c("tsne_1", "tsne_2")
rownames(my.tsne) = colnames(my.int)
my.int[["tsne"]] = CreateDimReducObject(embeddings = my.tsne, key = "tsne_", assay = DefaultAssay(my.int), global = T)
Here we perform the Louvain-jaccard clustering implemented in Seurat. We can see the tree of clusters, to see how the clusters relate in the PCA space.
We also use the tree, to check if any of the terminal pairs of sisters should be merged. This is determined based on a minimum of 20 DEs between the clusters.
# Find first the nearest neighbors
my.int <- FindNeighbors(object = my.int, dims=my.dimensions, verbose = F)
# Then the actual clusters
my.int <- FindClusters(object = my.int, resolution = 1, verbose = F, random.seed = 42)
# Check the tree of clusters, to see what's the relationship between them
my.int <- BuildClusterTree(my.int, dims = my.dimensions, verbose = F)
plot(Tool(object = my.int, slot = 'BuildClusterTree'))
# We are gonna check for DEs using the non integrated data. We are only gonna test genes that have variability, so we calculate variable genes
my.int <- FindVariableFeatures(my.int, assay = "RNA")
# Only the genes with variability > median
my.HVF <- HVFInfo(my.int, assay = "RNA")
my.HVF <- rownames(my.HVF)[which(my.HVF[,3] > (median(my.HVF[,3])))]
# We check for pairs of clusters, how mayn DEs they have. If less than n, we merge them
keep.check <- T
while (keep.check == T) {
keep.check <- F
# Check the tree of clusters, to see what's the relationship between them
my.int <- BuildClusterTree(my.int, dims = my.dimensions, verbose = F)
# Check only the terminal sisters
to.check = ips::terminalSisters(my.int@tools$BuildClusterTree)
for (i in to.check) {
# DE between the sisters
my.DE <- FindMarkers(my.int, i[1], i[2], test.use = "MAST", latent.vars = c("CC.Difference.seurat"),
min.pct = 0.25, verbose = T, assay = "RNA", features = my.HVF,) %>%
dplyr::filter(abs(avg_log2FC) > 0.5) %>%
dplyr::filter(p_val_adj < 0.05)
# If less than 5, merge, and repeat
if (dim(my.DE)[1] < 5) {
cat(paste0(dim(my.DE)[1], " genes differentially expressed between clusters ",i[1]," and ",i[2]," merging \n"))
my.int <- SetIdent(my.int, cells = WhichCells(my.int, idents = i[2]), value = i[1])
keep.check <- T
}
print(i)
}
}
rm(to.check, my.HVF, my.DE)
# renumber starting from 1
my.ID <- factor(
Idents(my.int),
levels= levels(Idents(my.int))[base::order(as.numeric(levels(Idents(my.int))))])
levels(my.ID) <- seq(length(levels(my.ID)))
Idents(my.int) <- my.ID
my.int[["seurat_clusters"]] <- my.ID
# Check again the clusters
dim1 <- DimPlot(my.int, reduction = "tsne", cols = rainbow(length(levels(my.ID))), label = T, label.size = 5, pt.size = 1)
dim1
Now that we have the final reductions, we’ll choose one and we look at the statistics of the cells.
modplots::mFeaturePlot(my.int,
my.features = c("OLIG2", "SOX9", "TUBB3", "SHH",
"SLC18A3", "GATA2", "PAX2", "TLX3",
"IGFBP7", "HBBA", "IFI30", "MSX2", "PLP1"),
gnames = modplots::gnames)
##
# set and get dim.reduct embeddings
my.reduc <- "tsne"
emb <- data.frame(Embeddings(my.int, my.reduc))
colnames(emb) <- c("reduc_1", "reduc_2")
meta <- my.int[[]] %>%
tibble::rownames_to_column("cell_ID")
my.plots = list()
my.plots[[1]] = ggplot(emb[meta[order(meta$nCount_RNA),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
geom_point(aes(color=sort(meta$nCount_RNA)), size=1, alpha = 0.4, pch = 19) +
scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
theme_classic() + labs(colour="UMI count", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))
my.plots[[2]] = ggplot(emb[meta[order(meta$nFeature_RNA),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
geom_point(aes(color=sort(meta$nFeature_RNA)), size=1, alpha = 0.4, pch = 19) +
scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
theme_classic() + labs(colour="gene count", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))
my.plots[[3]] = ggplot(emb[meta[order(meta$nCount_SCT),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
geom_point(aes(color=sort(meta$nCount_RNA)), size=1, alpha = 0.4, pch = 19) +
scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
theme_classic() + labs(colour="UMI count (SCT)", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))
my.plots[[4]] = ggplot(emb[meta[order(meta$nFeature_SCT),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
geom_point(aes(color=sort(meta$nFeature_RNA)), size=1, alpha = 0.4, pch = 19) +
scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
theme_classic() + labs(colour="gene count (SCT)", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))
my.plots[[5]] = ggplot(emb[meta[order(meta$percent.mt),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
geom_point(aes(color=(sort(meta$percent.mt))), size=1, alpha = 0.4, pch = 19) +
scale_colour_gradientn(colours = c("gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
theme_classic() + labs(colour="log1p MT percent", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))
my.plots[[6]] = ggplot(emb[meta[order(meta$CC.Difference.seurat),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
geom_point(aes(color=sort(meta$CC.Difference.seurat)), size=1, alpha = 0.4, pch = 19) +
scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
theme_classic() + labs(colour="Cell Cycle\nS-G2M", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))
my.plots[[7]] = ggplot(emb[meta[order(meta$percent.rb),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
geom_point(aes(color=sort(meta$percent.rb)), size=2, alpha = 0.4, pch = 19) +
scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
theme_classic() + labs(colour="percent.rb")
grid.arrange(grobs=my.plots, ncol=2)
Here we do the differential expression analysis, and end up with the marker genes lists. We can also see the marker gene dot plot, for the top 2 marker genes per cluster
# Find all the marker genes, with these thresholds MAST
semarkers = FindAllMarkers(my.int,
features = my.int[["integrated"]]@var.features,
only.pos = TRUE,
min.pct = 0.25,
logfc.threshold = 0.5,
latent.vars = c("CC.Difference.seurat"),
test.use = "MAST",
assay = "RNA",
return.thresh = 0.05)
# We only keep the significant ones
semarkers <- semarkers %>%
filter(p_val_adj < 0.05) %>%
rename(Gene.stable.ID = gene) %>%
left_join(modplots::gnames, by = "Gene.stable.ID")
# Take only the top 10
semrk10 = semarkers %>% group_by(cluster) %>% top_n(-10, p_val_adj)
semrk1 = semarkers %>% group_by(cluster) %>% top_n(-1, p_val_adj)
modplots::mDotPlot2(my.int,
features = unique(semrk1$Gene.stable.ID),
cols = c("grey", "black"),
gnames = modplots::gnames, dot.scale = 6) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
saveRDS(
my.int,
paste0(
"~/spinal_cord_paper/data/",
my.int@project.name,
"_seurat_",
format(Sys.Date(), "%d%m%y"),
".rds"
)
)
write.table(
semarkers,
sep = "\t",
row.names = T,
col.names = T,
file = paste0(
"~/spinal_cord_paper/data/",
my.int@project.name,
"_fullDE_",
format(Sys.Date(), "%d%m%y"),
".txt"
),
quote = F
)
write.table(
semrk10,
sep = "\t",
row.names = T,
col.names = T,
file = paste0(
"~/spinal_cord_paper/data/",
my.int@project.name,
"_top10DE_",
format(Sys.Date(), "%d%m%y"),
".txt"
),
quote = F
)
# Date and time of Rendering
Sys.time()
## [1] "2024-05-27 19:06:33 CEST"
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /scicore/soft/apps/OpenBLAS/0.3.1-GCC-7.3.0-2.30/lib/libopenblas_sandybridgep-r0.3.1.so
##
## locale:
## [1] en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggdendro_0.1.22 cowplot_1.1.1 patchwork_1.1.1 gridExtra_2.3
## [5] ggplot2_3.3.3 tidyr_1.1.3 RColorBrewer_1.1-2 Rtsne_0.15
## [9] dplyr_1.0.10 magrittr_2.0.1 SeuratObject_4.0.2 Seurat_4.0.5
##
## loaded via a namespace (and not attached):
## [1] fastmatch_1.1-0 plyr_1.8.6
## [3] igraph_1.2.6 lazyeval_0.2.2
## [5] sp_1.4-5 splines_4.1.0
## [7] listenv_0.8.0 scattermore_0.7
## [9] GenomeInfoDb_1.28.0 digest_0.6.27
## [11] htmltools_0.5.1.1 ips_0.0.11
## [13] fansi_0.5.0 memoise_2.0.0
## [15] tensor_1.5 cluster_2.1.2
## [17] ROCR_1.0-11 Biostrings_2.60.0
## [19] globals_0.16.2 matrixStats_0.58.0
## [21] modplots_1.0.0 spatstat.sparse_3.0-0
## [23] prettyunits_1.1.1 colorspace_2.0-1
## [25] blob_1.2.1 ggrepel_0.9.1
## [27] xfun_0.34 crayon_1.4.1
## [29] RCurl_1.98-1.3 jsonlite_1.7.2
## [31] spatstat.data_3.0-0 survival_3.2-11
## [33] zoo_1.8-9 phangorn_2.7.0
## [35] ape_5.5 glue_1.6.2
## [37] polyclip_1.10-0 gtable_0.3.0
## [39] zlibbioc_1.38.0 XVector_0.32.0
## [41] leiden_0.3.9 DelayedArray_0.18.0
## [43] future.apply_1.7.0 SingleCellExperiment_1.14.1
## [45] BiocGenerics_0.38.0 abind_1.4-5
## [47] scales_1.1.1 pheatmap_1.0.12
## [49] DBI_1.1.1 miniUI_0.1.1.1
## [51] Rcpp_1.0.7 progress_1.2.2
## [53] viridisLite_0.4.0 xtable_1.8-4
## [55] reticulate_1.22 spatstat.core_2.1-2
## [57] bit_4.0.4 stats4_4.1.0
## [59] htmlwidgets_1.5.3 httr_1.4.2
## [61] ellipsis_0.3.2 ica_1.0-2
## [63] pkgconfig_2.0.3 XML_3.99-0.6
## [65] farver_2.1.0 sass_0.4.0
## [67] uwot_0.1.10 deldir_1.0-6
## [69] utf8_1.2.1 AnnotationDbi_1.54.0
## [71] tidyselect_1.2.0 labeling_0.4.2
## [73] rlang_1.0.6 reshape2_1.4.4
## [75] later_1.2.0 cachem_1.0.5
## [77] munsell_0.5.0 tools_4.1.0
## [79] cli_3.4.1 RSQLite_2.2.7
## [81] generics_0.1.3 ggridges_0.5.3
## [83] org.Gg.eg.db_3.13.0 evaluate_0.20
## [85] stringr_1.4.0 fastmap_1.1.0
## [87] yaml_2.2.1 goftest_1.2-2
## [89] bit64_4.0.5 knitr_1.41
## [91] fitdistrplus_1.1-6 purrr_0.3.4
## [93] RANN_2.6.1 KEGGREST_1.32.0
## [95] pbapply_1.4-3 future_1.30.0
## [97] nlme_3.1-152 mime_0.10
## [99] compiler_4.1.0 plotly_4.10.0
## [101] png_0.1-7 spatstat.utils_3.0-1
## [103] tibble_3.1.8 bslib_0.2.5.1
## [105] stringi_1.6.2 highr_0.9
## [107] lattice_0.20-44 Matrix_1.3-3
## [109] vctrs_0.5.1 pillar_1.8.1
## [111] lifecycle_1.0.3 spatstat.geom_3.0-3
## [113] lmtest_0.9-38 jquerylib_0.1.4
## [115] RcppAnnoy_0.0.19 data.table_1.14.0
## [117] bitops_1.0-7 irlba_2.3.3
## [119] GenomicRanges_1.44.0 httpuv_1.6.1
## [121] R6_2.5.0 promises_1.2.0.1
## [123] KernSmooth_2.23-20 IRanges_2.26.0
## [125] parallelly_1.33.0 codetools_0.2-18
## [127] MASS_7.3-54 assertthat_0.2.1
## [129] SummarizedExperiment_1.22.0 MAST_1.18.0
## [131] withr_2.4.2 sctransform_0.3.3
## [133] GenomeInfoDbData_1.2.6 S4Vectors_0.30.0
## [135] hms_1.1.0 mgcv_1.8-35
## [137] parallel_4.1.0 quadprog_1.5-8
## [139] grid_4.1.0 rpart_4.1-15
## [141] rmarkdown_2.17 MatrixGenerics_1.4.0
## [143] Biobase_2.52.0 shiny_1.6.0